/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.db;
import java.io.*;
import java.util.*;
import java.util.logging.*;
import java.nio.channels.*;
import net.nutch.io.*;
import net.nutch.util.*;
import net.nutch.pagedb.*;
import net.nutch.linkdb.*;
/***************************************************
* This is a wrapper class that allows us to reorder
* write operations to the linkdb and pagedb. It is
* useful only for objects like UpdateDatabaseTool,
* which just do writes.
*
* The WebDBWriter is a traditional single-pass database writer.
* Instructions are buffered in on-disk edit files, then sorted
* and merged into brand-new table files when close() is called.
* Nothing is done in a distributed fashion; other implementors
* of IWebDBWriter take care of that.
*
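* A minimal usage sketch (the path and page values here are
* illustrative, not part of any real crawl):
* <pre>
*   WebDBWriter writer = new WebDBWriter(new File("crawldb"));
*   try {
*       writer.addPage(new Page("http://example.com/",
*           MD5Hash.digest("http://example.com/")));
*   } finally {
*       writer.close();    // edits are sorted and merged here
*   }
* </pre>
*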
* @author Mike Cafarella
*************************************************/
public class WebDBWriter implements IWebDBWriter {
static final Logger LOG = LogFormatter.getLogger("net.nutch.db.WebDBWriter");
static final byte CUR_VERSION = 0;
// db opcodes
static final byte ADD_PAGE = 0;
static final byte ADD_PAGE_WITH_SCORE = 1;
static final byte ADD_PAGE_IFN_PRESENT = 2;
static final byte DEL_PAGE = 3;
static final int ADD_LINK = 0;
static final int DEL_LINK = 1;
static final int DEL_SINGLE_LINK = 2;
// filenames
static final String PAGES_BY_URL = "pagesByURL";
static final String PAGES_BY_MD5 = "pagesByMD5";
static final String LINKS_BY_URL = "linksByURL";
static final String LINKS_BY_MD5 = "linksByMD5";
static final String STATS_FILE = "stats";
// Result codes for page-url comparisons
static final int NO_OUTLINKS = 0;
static final int HAS_OUTLINKS = 1;
static final int LINK_INVALID = 2;
/********************************************
* PageInstruction holds a single operation over a Page: one of
* the opcodes above, the Page itself, and possibly a Link (used
* with ADD_PAGE_IFN_PRESENT).
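* e.g., a hypothetical delete instruction:
* <pre>
*   PageInstruction pi = new PageInstruction(page, DEL_PAGE);
* </pre>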
*********************************************/
public static class PageInstruction implements WritableComparable {
byte opcode;
boolean hasLink;
Page page;
Link link;
/**
* Default constructor; fields are filled in later by set() or readFields().
*/
public PageInstruction() {}
/**
* Init with a Page and an opcode, and no Link.
*/
public PageInstruction(Page page, int opcode) {
set(page, opcode);
}
/**
* Init with a Page, a Link, and an opcode.
*/
public PageInstruction(Page page, Link link, int opcode) {
set(page, link, opcode);
}
/**
* Init from another PageInstruction object.
*/
public void set(PageInstruction that) {
this.opcode = that.opcode;
if (this.page == null) {
this.page = new Page();
}
this.page.set(that.page);
if (this.link == null) {
this.link = new Link();
}
this.hasLink = that.hasLink;
if (this.hasLink) {
this.link.set(that.link);
}
}
/**
* Init PageInstruction with no Link
*/
public void set(Page page, int opcode) {
this.opcode = (byte) opcode;
this.page = page;
this.hasLink = false;
this.link = null;
}
/**
* Init PageInstruction with a Link
*/
public void set(Page page, Link link, int opcode) {
this.opcode = (byte) opcode;
this.page = page;
this.hasLink = true;
this.link = link;
}
//
// WritableComparable
//
public int compareTo(Object o) {
int pageResult = this.page.compareTo(((PageInstruction) o).page);
if (pageResult != 0) {
return pageResult;
} else {
return this.opcode - (((PageInstruction) o).opcode);
}
}
public void write(DataOutput out) throws IOException {
out.writeByte(opcode);
page.write(out);
out.writeByte(hasLink ? 1 : 0);
if (hasLink) {
link.write(out);
}
}
public void readFields(DataInput in) throws IOException {
opcode = in.readByte();
if (page == null) {
page = new Page();
}
page.readFields(in);
if (link == null) {
link = new Link();
}
hasLink = (1 == in.readByte());
if (hasLink) {
link.readFields(in);
}
}
public Page getPage() {
return page;
}
public Link getLink() {
if (hasLink) {
return link;
} else {
return null;
}
}
public int getInstruction() {
return opcode;
}
/**
* Sorts the instruction first by Page, then by opcode.
*/
public static class PageComparator extends WritableComparator {
private static final Page.Comparator PAGE_COMPARATOR =
new Page.Comparator();
public PageComparator() { super(PageInstruction.class); }
/** Optimized comparator. */
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
int opcode1 = b1[s1];
int opcode2 = b2[s2];
int c = PAGE_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1);
if (c != 0)
return c;
return opcode1 - opcode2;
}
}
/*****************************************************
* Sorts the instruction first by url, then by opcode.
*****************************************************/
public static class UrlComparator extends WritableComparator {
private static final Page.UrlComparator PAGE_COMPARATOR =
new Page.UrlComparator();
public UrlComparator() { super(PageInstruction.class); }
/**
* Sort by URL first, then by opcode.
*/
public int compare(WritableComparable a, WritableComparable b) {
PageInstruction instructionA = (PageInstruction)a;
PageInstruction instructionB = (PageInstruction)b;
Page pageA = instructionA.getPage();
Page pageB = instructionB.getPage();
int result = pageA.getURL().compareTo(pageB.getURL());
if (result != 0) {
return result;
} else {
return instructionA.opcode - instructionB.opcode;
}
}
/**
* Optimized comparator.
*/
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
int opcode1 = b1[s1];
int opcode2 = b2[s2];
int c = PAGE_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1);
if (c != 0)
return c;
return opcode1 - opcode2;
}
}
}
/********************************************************
* PageInstructionWriter efficiently appends PageInstructions to
* a SequenceFile.Writer by reusing a single PageInstruction
* instance, instead of allocating a new one per append.
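* e.g. (the SequenceFile.Writer and Page come from the caller):
* <pre>
*   PageInstructionWriter piw = new PageInstructionWriter();
*   piw.appendInstructionInfo(seqWriter, page, ADD_PAGE, NullWritable.get());
* </pre>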
********************************************************/
public static class PageInstructionWriter {
PageInstruction pi = new PageInstruction();
/**
* Create a writer with a single reusable PageInstruction.
*/
public PageInstructionWriter() {
}
/**
* Append the PageInstruction info to the indicated SequenceFile,
* and keep the PI for later reuse.
*/
public synchronized void appendInstructionInfo(SequenceFile.Writer writer, Page page, int opcode, Writable val) throws IOException {
pi.set(page, opcode);
writer.append(pi, val);
}
/**
* Append the PageInstruction info to the indicated SequenceFile,
* and keep the PI for later reuse.
*/
public synchronized void appendInstructionInfo(SequenceFile.Writer writer, Page page, Link link, int opcode, Writable val) throws IOException {
pi.set(page, link, opcode);
writer.append(pi, val);
}
}
/*************************************************************
* Reduce multiple instructions for a given url to the single
* effective instruction. ADD is prioritized highest, then
* ADD_IFN_PRESENT, and then DEL. Not coincidentally, that is
* also the order they sort in, so the first instruction in a
* URL's sorted run takes effect and the rest are skipped.
**************************************************************/
private static class DeduplicatingPageSequenceReader {
SequenceFile.Reader edits;
PageInstruction current = new PageInstruction();
UTF8 currentUrl = new UTF8();
boolean haveCurrent;
/**
* Wrap a sorted edit stream and prime the first instruction.
*/
public DeduplicatingPageSequenceReader(SequenceFile.Reader edits) throws IOException {
this.edits = edits;
this.haveCurrent = edits.next(current, NullWritable.get());
}
/**
* Fill result with the effective (first) instruction for the
* next URL, skipping the rest of that URL's run.
*/
public boolean next(PageInstruction result) throws IOException {
if (!haveCurrent) {
return false;
}
currentUrl.set(current.getPage().getURL());
result.set(current); // take the first instruction
do {
// skip the rest
} while ((haveCurrent = edits.next(current, NullWritable.get())) &&
currentUrl.compareTo(current.getPage().getURL()) == 0);
return true;
}
}
/*************************************************
* Holds an instruction over a Link.
*************************************************/
public static class LinkInstruction implements WritableComparable {
Link link;
int instruction;
/**
* Default constructor; fields are filled in later by set() or readFields().
*/
public LinkInstruction() {
}
/**
* Init with a Link and an instruction code.
*/
public LinkInstruction(Link link, int instruction) {
set(link, instruction);
}
/**
* Re-init from another LinkInstruction's info.
*/
public void set(LinkInstruction that) {
this.instruction = that.instruction;
if (this.link == null)
this.link = new Link();
this.link.set(that.link);
}
/**
* Re-init with a Link and an instruction
*/
public void set(Link link, int instruction) {
this.link = link;
this.instruction = instruction;
}
//
// WritableComparable
//
public int compareTo(Object o) {
return this.link.compareTo(((LinkInstruction) o).link);
}
public void write(DataOutput out) throws IOException {
out.writeByte(instruction);
link.write(out);
}
public void readFields(DataInput in) throws IOException {
this.instruction = in.readByte();
if (link == null)
link = new Link();
link.readFields(in);
}
public Link getLink() {
return link;
}
public int getInstruction() {
return instruction;
}
/*******************************************************
* Sorts instructions by their Link, MD5 first. (The opcode
* is not part of the sort key.)
*******************************************************/
public static class MD5Comparator extends WritableComparator {
private static final Link.MD5Comparator MD5_COMPARATOR =
new Link.MD5Comparator();
public MD5Comparator() { super(LinkInstruction.class); }
public int compare(WritableComparable a, WritableComparable b) {
LinkInstruction instructionA = (LinkInstruction)a;
LinkInstruction instructionB = (LinkInstruction)b;
return instructionA.link.md5Compare(instructionB.link);
}
/** Optimized comparator. */
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
return MD5_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1);
}
}
/*********************************************************
* Sorts instructions by their Link, URL first. (The opcode
* is not part of the sort key.)
*********************************************************/
public static class UrlComparator extends WritableComparator {
private static final Link.UrlComparator URL_COMPARATOR =
new Link.UrlComparator();
public UrlComparator() { super(LinkInstruction.class); }
public int compare(WritableComparable a, WritableComparable b) {
LinkInstruction instructionA = (LinkInstruction)a;
LinkInstruction instructionB = (LinkInstruction)b;
return instructionA.link.urlCompare(instructionB.link);
}
/**
* Optimized comparator.
*/
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
return URL_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1);
}
}
}
/*******************************************************
* LinkInstructionWriter efficiently appends LinkInstructions to
* a SequenceFile.Writer by reusing a single LinkInstruction
* instance, instead of allocating a new one per append.
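* e.g. (the SequenceFile.Writer and Link come from the caller):
* <pre>
*   LinkInstructionWriter liw = new LinkInstructionWriter();
*   liw.appendInstructionInfo(seqWriter, link, ADD_LINK, NullWritable.get());
* </pre>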
********************************************************/
public static class LinkInstructionWriter {
LinkInstruction li = new LinkInstruction();
/**
* Create a writer with a single reusable LinkInstruction.
*/
public LinkInstructionWriter() {
}
/**
* Append the LinkInstruction info to the indicated SequenceFile
* and keep the LI for later reuse.
*/
public synchronized void appendInstructionInfo(SequenceFile.Writer writer, Link link, int opcode, Writable val) throws IOException {
li.set(link, opcode);
writer.append(li, val);
}
}
/********************************************************
* This class deduplicates link operations. The edits arrive
* sorted by MD5, then by URL; within a run of instructions
* for the same (MD5, URL) Link, only the last one takes effect.
*********************************************************/
class DeduplicatingLinkSequenceReader {
Link currentKey = new Link();
LinkInstruction current = new LinkInstruction();
SequenceFile.Reader edits;
boolean haveCurrent;
/**
* Wrap a sorted edit stream and prime the first instruction.
*/
public DeduplicatingLinkSequenceReader(SequenceFile.Reader edits) throws IOException {
this.edits = edits;
this.haveCurrent = edits.next(current, NullWritable.get());
}
/**
* The incoming stream of edits is sorted first by MD5, then by URL.
* MD5-only values always come before MD5+URL.
*/
public boolean next(LinkInstruction key) throws IOException {
if (! haveCurrent) {
return false;
}
currentKey.set(current.getLink());
do {
key.set(current);
} while ((haveCurrent = edits.next(current, NullWritable.get())) &&
currentKey.compareTo(current.getLink()) == 0);
return true;
}
}
/**************************************************
* The CloseProcessor class is used when we close down
* the webdb. We give it the path, members, and class values
* needed to apply changes to any of our 4 data tables.
*
* This is an abstract class. Each subclass must define
* the exact merge procedure. However, file-handling
* and edit-processing are standardized as much as possible.
*
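* The lifecycle is roughly the following (a sketch; the concrete
* processors below supply the real arguments):
* <pre>
*   CloseProcessor p = new PagesByURLProcessor(db, editWriter, futureEdits);
*   long written = p.closeDown(tmpDir, newDbFile, numEdits);
* </pre>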
**************************************************/
private abstract class CloseProcessor {
String basename;
MapFile.Reader oldDb;
SequenceFile.Writer editWriter;
SequenceFile.Sorter sorter;
WritableComparator comparator;
Class keyClass, valueClass;
long itemsWritten = 0;
/**
* Store away these members for later use.
*/
CloseProcessor(String basename, MapFile.Reader oldDb, SequenceFile.Writer editWriter, SequenceFile.Sorter sorter, WritableComparator comparator, Class keyClass, Class valueClass) {
this.basename = basename;
this.oldDb = oldDb;
this.editWriter = editWriter;
this.sorter = sorter;
this.comparator = comparator;
this.keyClass = keyClass;
this.valueClass = valueClass;
}
/**
* Perform the shutdown sequence for this Processor.
* There is a lot of file-moving and edit-sorting that
* is common across all 4 tables.
*
* Returns how many items were written out by this close().
*/
long closeDown(File workingDir, File outputDir, long numEdits) throws IOException {
File editsFile = new File(workingDir, basename + ".out");
File newDbFile = new File(outputDir, basename);
File sortedEditsFile = new File(editsFile.getPath() + ".sorted");
editWriter.close();
// If there are edits, then process them.
if (numEdits != 0) {
// Sort the edits
long startSort = System.currentTimeMillis();
sorter.sort(editsFile.getPath(), sortedEditsFile.getPath());
long endSort = System.currentTimeMillis();
LOG.info("Processing " + basename + ": Sorted " + numEdits + " instructions in " + ((endSort - startSort) / 1000.0) + " seconds.");
LOG.info("Processing " + basename + ": Sorted " + (numEdits / ((endSort - startSort) / 1000.0)) + " instructions/second");
// Rename appropriately
editsFile.delete();
sortedEditsFile.renameTo(editsFile);
// Read the sorted edits
SequenceFile.Reader sortedEdits = new SequenceFile.Reader(editsFile.getPath());
// Create a brand-new output db for the integrated data
MapFile.Writer newDb = (comparator == null) ? new MapFile.Writer(newDbFile.getPath(), keyClass, valueClass) : new MapFile.Writer(newDbFile.getPath(), comparator, valueClass);
// Iterate through the edits, and merge changes with existing
// db into the brand-new file
oldDb.reset();
// Merge the edits. We did it!
long startMerge = System.currentTimeMillis();
mergeEdits(oldDb, sortedEdits, newDb);
long endMerge = System.currentTimeMillis();
LOG.info("Processing " + basename + ": Merged to new DB containing " + itemsWritten + " records in " + ((endMerge - startMerge) / 1000.0) + " seconds");
LOG.info("Processing " + basename + ": Merged " + (itemsWritten / ((endMerge - startMerge) / 1000.0)) + " records/second");
// Close down readers, writers
sortedEdits.close();
newDb.close();
} else {
// Otherwise, simply copy the file into place,
// without all the processing overhead.
long startCopy = System.currentTimeMillis();
File curFile = new File(dbFile, basename);
FileUtil.recursiveCopy(curFile, newDbFile);
long endCopy = System.currentTimeMillis();
LOG.info("Processing " + basename + ": Copied file (" + newDbFile.length()+ " bytes) in " + ((endCopy - startCopy) / 1000.0) + " secs.");
}
// Delete the now-consumed edits file to save space
editsFile.delete();
return itemsWritten;
}
/**
* The loop that actually applies the changes and writes to
* a new db. This is different for every subclass!
*/
abstract void mergeEdits(MapFile.Reader db, SequenceFile.Reader edits, MapFile.Writer newDb) throws IOException;
}
/***
* The PagesByURLProcessor is used during close() time for
* the pagesByURL table. We instantiate one of these, and it
* takes care of the entire shutdown process.
*/
private class PagesByURLProcessor extends CloseProcessor {
SequenceFile.Writer futureEdits;
/**
* We store "futureEdits" so we can emit edits for the
* next (pagesByMD5) stage while processing this one.
*/
PagesByURLProcessor(MapFile.Reader db, SequenceFile.Writer editWriter, SequenceFile.Writer futureEdits) {
super(PAGES_BY_URL, db, editWriter, new SequenceFile.Sorter(new PageInstruction.UrlComparator(), NullWritable.class), new UTF8.Comparator(), null, Page.class);
this.futureEdits = futureEdits;
}
/**
* Merge the existing db with the edit-stream into a brand-new file.
*/
void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException {
// Create the keys and vals we'll be using
DeduplicatingPageSequenceReader edits = new DeduplicatingPageSequenceReader(sortedEdits);
WritableComparable readerKey = new UTF8();
Page readerVal = new Page();
PageInstruction editItem = new PageInstruction();
// Read the first items from both streams
boolean hasEntries = db.next(readerKey, readerVal);
boolean hasEdits = edits.next(editItem);
// As long as we have both edits and entries, we need to
// interleave them....
while (hasEntries && hasEdits) {
int comparison = readerKey.compareTo(editItem.getPage().getURL());
int curInstruction = editItem.getInstruction();
// Perform operations
if ((curInstruction == ADD_PAGE) ||
(curInstruction == ADD_PAGE_WITH_SCORE) ||
(curInstruction == ADD_PAGE_IFN_PRESENT)) {
if (comparison < 0) {
// Write readerKey, just passing it along.
// Don't process the edit yet.
newDb.append(readerKey, readerVal);
itemsWritten++;
hasEntries = db.next(readerKey, readerVal);
} else if (comparison == 0) {
// The keys are equal. If the instruction
// is ADD_PAGE, we write the edit's key and
// replace the old one.
//
// Otherwise, if it's ADD_IFN_PRESENT,
// keep the reader's item intact.
//
if ((curInstruction == ADD_PAGE) ||
(curInstruction == ADD_PAGE_WITH_SCORE)) {
// An ADD_PAGE with an identical pair
// of pages replaces the existing one.
// We may need to note the fact for
// Garbage Collection.
//
// This happens in three stages.
// 1. We write necessary items to the future
// edits-list.
//
pagesByMD5Edits++;
// If this is a replacing add, we don't want
// to disturb the score from the old Page! This
// way, we can run some link analysis scoring
// while the new Pages are being fetched and
// not lose the info when a Page is replaced.
//
// If it is an ADD_PAGE_WITH_SCORE, then we
// go ahead and replace the old one.
//
// Either way, from now on we treat it
// as an ADD_PAGE
//
Page editItemPage = editItem.getPage();
if (curInstruction == ADD_PAGE) {
editItemPage.setScore(readerVal.getScore(), readerVal.getNextScore());
}
piwriter.appendInstructionInfo(futureEdits, editItemPage, ADD_PAGE, NullWritable.get());
//
// 2. We write the edit-page to *this* table.
//
newDb.append(editItemPage.getURL(), editItemPage);
//
// 3. We want the ADD in the next step (the
// MD5-driven table) to be a "replacing add".
// But that won't happen if the readerItem and
// the editItem Pages are not identical.
// (In this scenario, that means their URLs
// are the same, but their MD5s are different.)
// So, we need to explicitly handle that
// case by issuing a DELETE for the now-obsolete
// item.
if (editItemPage.compareTo(readerVal) != 0) {
pagesByMD5Edits++;
piwriter.appendInstructionInfo(futureEdits, readerVal, DEL_PAGE, NullWritable.get());
}
itemsWritten++;
// "Delete" the readerVal by skipping it.
hasEntries = db.next(readerKey, readerVal);
} else {
// ADD_PAGE_IFN_PRESENT: we only add if NOT
// present, and it was present. So this is a
// no-op; just move on to the next edit.
}
// In either case, we process the edit.
hasEdits = edits.next(editItem);
} else if (comparison > 0) {
// We have inserted a Page that's before some
// entry in the existing database. So, we just
// need to write down the Page from the Edit file.
// It's like the above case, except we don't tell
// the future-edits to delete anything.
//
// 1. Write the item down for the future.
pagesByMD5Edits++;
//
// If this is an ADD_PAGE_IFN_PRESENT, then
// we may also have a Link we have to take care of!
//
if (curInstruction == ADD_PAGE_IFN_PRESENT) {
Link editLink = editItem.getLink();
if (editLink != null) {
addLink(editLink);
}
}
piwriter.appendInstructionInfo(futureEdits, editItem.getPage(), ADD_PAGE, NullWritable.get());
//
// 2. Write the edit-page to *this* table
newDb.append(editItem.getPage().getURL(), editItem.getPage());
itemsWritten++;
// Process the edit
hasEdits = edits.next(editItem);
}
} else if (curInstruction == DEL_PAGE) {
if (comparison < 0) {
// Write the readerKey, just passing it along.
// We don't process the edit yet.
newDb.append(readerKey, readerVal);
itemsWritten++;
hasEntries = db.next(readerKey, readerVal);
} else if (comparison == 0) {
// Delete it! We can only delete one item
// at a time, as all URLs are unique.
// 1. Tell the future-edits what page will need to
// be deleted.
pagesByMD5Edits++;
piwriter.appendInstructionInfo(futureEdits, readerVal, DEL_PAGE, NullWritable.get());
//
// 2. "Delete" the entry by skipping the Reader
// key.
hasEntries = db.next(readerKey, readerVal);
// Process the edit
hasEdits = edits.next(editItem);
} else if (comparison > 0) {
// Ignore it. We tried to delete an item that's
// not here.
hasEdits = edits.next(editItem);
}
}
}
// Now we have only edits. No more preexisting items!
while (! hasEntries && hasEdits) {
int curInstruction = editItem.getInstruction();
if (curInstruction == ADD_PAGE ||
curInstruction == ADD_PAGE_WITH_SCORE ||
curInstruction == ADD_PAGE_IFN_PRESENT) {
// No more reader entries, so ADD_PAGE_IFN_PRESENT
// is treated like a simple ADD_PAGE.
// 1. Tell the future edits-list about this new item
pagesByMD5Edits++;
//
// If this is an ADD_PAGE_IFN_PRESENT, then
// we may also have a Link we have to take care of!
//
if (curInstruction == ADD_PAGE_IFN_PRESENT) {
Link editLink = editItem.getLink();
if (editLink != null) {
addLink(editLink);
}
}
piwriter.appendInstructionInfo(futureEdits, editItem.getPage(), ADD_PAGE, NullWritable.get());
// 2. Write the edit page to this table.
newDb.append(editItem.getPage().getURL(), editItem.getPage());
itemsWritten++;
} else if (curInstruction == DEL_PAGE) {
// Ignore it. We tried to delete an item
// that's not here.
}
// Either way, we always process the edit.
hasEdits = edits.next(editItem);
}
// Now we have only preexisting items. We just copy
// them to the new file, in order.
while (hasEntries && ! hasEdits) {
newDb.append(readerKey, readerVal);
itemsWritten++;
hasEntries = db.next(readerKey, readerVal);
}
}
}
/***
* The PagesByMD5Processor is used during close() time for
* the pagesByMD5 table. We instantiate one of these, and it
* takes care of the entire shutdown process.
*/
private class PagesByMD5Processor extends CloseProcessor {
/**
*/
PagesByMD5Processor(MapFile.Reader db, SequenceFile.Writer editWriter) {
super(PAGES_BY_MD5, db, editWriter, new SequenceFile.Sorter(new PageInstruction.PageComparator(), NullWritable.class), null, Page.class, NullWritable.class);
}
/**
*/
void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException {
// Create the keys and vals
Page readerItem = new Page();
PageInstruction editItem = new PageInstruction();
// For computing the GC list
Page deletedItem = new Page(), lastItem = new Page();
boolean justDeletedItem = false;
boolean newReaderItem = false;
int itemRepeats = 0;
// Read the first items from both streams
boolean hasEntries = db.next(readerItem, NullWritable.get());
boolean hasEdits = sortedEdits.next(editItem, NullWritable.get());
if (hasEntries) {
// The first thing we read should become
// the "previous key". We need this for
// garbage collection.
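// (Serializing readerItem to outBuf and reading it
// back into lastItem is how we deep-copy it, so the
// two never share state.)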
outBuf.reset();
readerItem.write(outBuf);
inBuf.reset(outBuf.getData(), outBuf.getLength());
lastItem.readFields(inBuf);
itemRepeats = 0;
}
// As long we have both edits and entries, we need to
// interleave them.
while (hasEdits && hasEntries) {
int comparison = readerItem.compareTo(editItem.getPage());
int curInstruction = editItem.getInstruction();
//
// OK! Now perform operations
//
if (curInstruction == ADD_PAGE) {
if (comparison < 0) {
// Write readerItem, just passing it along.
// Don't process the edit yet.
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
newReaderItem = true;
} else if (comparison == 0) {
//
// This is a "replacing ADD", which is generated
// by the above-sequence. We should skip over the
// existing item, and add the new one instead.
//
// Note that by this point, the new version of the
// Page from the edit sequence is guaranteed to
// have the correct score. We make sure of it in
// the mergeEdits() for PagesByURLProcessor.
//
newDb.append(editItem.getPage(), NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
newReaderItem = true;
hasEdits = sortedEdits.next(editItem, NullWritable.get());
} else if (comparison > 0) {
// Write the edit item. We've inserted an item
// that comes before any others.
newDb.append(editItem.getPage(), NullWritable.get());
itemsWritten++;
hasEdits = sortedEdits.next(editItem, NullWritable.get());
}
} else if (curInstruction == ADD_PAGE_IFN_PRESENT) {
throw new IOException("Should never process ADD_PAGE_IFN_PRESENT for the index: " + editItem);
} else if (curInstruction == DEL_PAGE) {
if (comparison < 0) {
// Write the readerKey, just passing it along.
// Don't process the edit yet.
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
newReaderItem = true;
} else if (comparison == 0) {
// Delete it! Remember only one entry can
// be deleted at a time!
//
// "Delete" the entry by skipping over the reader
// item. We move onto the next item in the existing
// index, as well as the next edit instruction.
hasEntries = db.next(readerItem, NullWritable.get());
newReaderItem = true;
hasEdits = sortedEdits.next(editItem, NullWritable.get());
// We need to set this flag for GC'ing.
justDeletedItem = true;
} else if (comparison > 0) {
// This should never happen! We should only be
// deleting items that actually appear!
throw new IOException("An unapplicable DEL_PAGE should never appear during index-merge: " + editItem);
}
}
// GARBAGE COLLECTION
// We want to detect when we have deleted the
// last MD5 of a certain value. We can have
// multiple MD5s in the same index, as long as
// they have different URLs. When the last MD5
// is deleted, we want to know so we can modify
// the LinkDB.
if (newReaderItem) {
// If the new readerItem has the same MD5 as
// our last one, then we know it's a repeat!
if (hasEntries && readerItem.getMD5().compareTo(lastItem.getMD5()) == 0) {
itemRepeats++;
} else {
// The current readerItem and the lastItem
// MD5s are not equal.
//
// If the last item was deleted, AND if the
// deleted item is not a repeat of the current item,
// then that MD5 should be garbage collected.
if (justDeletedItem && itemRepeats == 0) {
deleteLink(lastItem.getMD5());
}
// The current readerItem is the new "last key".
outBuf.reset();
readerItem.write(outBuf);
inBuf.reset(outBuf.getData(), outBuf.getLength());
lastItem.readFields(inBuf);
itemRepeats = 0;
}
// Clear "new-reader-item" bit
newReaderItem = false;
}
// Clear "last-deleted" bit
justDeletedItem = false;
}
// Now we have only edits. No more preexisting items!
while (! hasEntries && hasEdits) {
int curInstruction = editItem.getInstruction();
if (curInstruction == ADD_PAGE) {
// Just write down the new page!
newDb.append(editItem.getPage(), NullWritable.get());
itemsWritten++;
} else if (curInstruction == ADD_PAGE_IFN_PRESENT) {
throw new IOException("Should never process ADD_PAGE_IFN_PRESENT for the index: " + editItem);
} else if (curInstruction == DEL_PAGE) {
// This should never happen! We should only be
// deleting items that actually appear!
throw new IOException("An unapplicable DEL_PAGE should never appear during index-merge: " + editItem);
}
hasEdits = sortedEdits.next(editItem, NullWritable.get());
}
// Now we have only preexisting items. We just copy them
// to the new file, in order
while (hasEntries && ! hasEdits) {
// Simply copy through the remaining database items
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
newReaderItem = true;
}
}
}
/**
* The LinksByMD5Processor is used during close() for
* the linksByMD5 table. It processes all the edits to
* this table, and also generates edits for the linksByURL
* table.
*/
private class LinksByMD5Processor extends CloseProcessor {
SequenceFile.Writer futureEdits;
/**
* "futureEdits" receives edits for the linksByURL stage; it is
* null during the stage-two rebuild, when there is no next stage.
*/
public LinksByMD5Processor(MapFile.Reader db, SequenceFile.Writer editWriter, SequenceFile.Writer futureEdits) {
super(LINKS_BY_MD5, db, editWriter, new SequenceFile.Sorter(new LinkInstruction.MD5Comparator(), NullWritable.class), new Link.MD5Comparator(), Link.class, NullWritable.class);
this.futureEdits = futureEdits;
}
/**
* Merges edits into the md5-driven link table. Also generates
* the edit sequence to apply to the URL-driven table.
*/
void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException {
WritableComparator comparator = new Link.MD5Comparator();
DeduplicatingLinkSequenceReader edits = new DeduplicatingLinkSequenceReader(sortedEdits);
// Create the keys and vals we'll use
LinkInstruction editItem = new LinkInstruction();
Link readerItem = new Link();
// Read the first items from both streams
boolean hasEntries = db.next(readerItem, NullWritable.get());
boolean hasEdits = edits.next(editItem);
// As long as we have both edits and entries to process,
// we need to interleave them
while (hasEntries && hasEdits) {
int curInstruction = editItem.getInstruction();
// Perform operations
if (curInstruction == ADD_LINK) {
// When we add a link, we may replace a previous
// link with identical URL and MD5 values. The
// MD5-first comparator uses both values.
//
int comparison = comparator.compare(readerItem, editItem.getLink());
if (comparison < 0) {
// Write the readerKey, just passing it along.
// Don't process the edit yet.
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
} else if (comparison == 0) {
// 1. Write down the item for table-edits
if (futureEdits != null) {
linksByURLEdits++;
liwriter.appendInstructionInfo(futureEdits, editItem.getLink(), ADD_LINK, NullWritable.get());
}
// 2. Write the new item, "replacing" the old one.
// We move to the next edit instruction and move
// past the replaced db entry.
newDb.append(editItem.getLink(), NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
hasEdits = edits.next(editItem);
} else if (comparison > 0) {
// 1. Write down the item for table-edits
if (futureEdits != null) {
linksByURLEdits++;
liwriter.appendInstructionInfo(futureEdits, editItem.getLink(), ADD_LINK, NullWritable.get());
}
// 2. Write the new item. We stay at the current
// db entry.
newDb.append(editItem.getLink(), NullWritable.get());
itemsWritten++;
hasEdits = edits.next(editItem);
}
} else if ((curInstruction == DEL_LINK) ||
(curInstruction == DEL_SINGLE_LINK)) {
// When we delete a link, we might delete many
// at once! We are interested only in the MD5
// here. If there are entries with identical MD5
// values, but different URLs, we get rid of them
// all.
int comparison = 0;
if (curInstruction == DEL_LINK) {
comparison = readerItem.getFromID().compareTo(editItem.getLink().getFromID());
} else {
comparison = readerItem.md5Compare(editItem.getLink());
}
if (comparison < 0) {
// Write the readerKey, just passing it along.
// Don't process the edit yet.
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
} else if (comparison == 0) {
// Delete it (or them!)
// 1. Write the full instruction for the next
// delete-stage. That includes the read-in
// value
// 2. "Delete" the entry by skipping the
// readerKey. We DO NOT go to the next edit
// instruction! There might still be more
// entries in the database to which we should
// apply this delete-edit.
//
// Step 1. Write entry for future table-edits
if (futureEdits != null) {
linksByURLEdits++;
liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_LINK, NullWritable.get());
}
// Step 2.
// We might want to delete multiple MD5s with
// a single delete() operation, so keep this
// edit instruction around
hasEntries = db.next(readerItem, NullWritable.get());
if (curInstruction == DEL_SINGLE_LINK) {
hasEdits = edits.next(editItem);
}
} else if (comparison > 0) {
// Ignore, move on to next instruction
hasEdits = edits.next(editItem);
}
}
}
// Now we have only edits. No more preexisting items!
while (! hasEntries && hasEdits) {
int curInstruction = editItem.getInstruction();
if (curInstruction == ADD_LINK) {
// 1. Write down the item for future table-edits
if (futureEdits != null) {
linksByURLEdits++;
liwriter.appendInstructionInfo(futureEdits, editItem.getLink(), ADD_LINK, NullWritable.get());
}
// 2. Just add the item from the edit list
newDb.append(editItem.getLink(), NullWritable.get());
itemsWritten++;
} else if (curInstruction == DEL_LINK) {
// Ignore operation
}
// Move on to next edit
hasEdits = edits.next(editItem);
}
// Now we have only preexisting items. Just copy them
// to the new file, in order.
while (hasEntries && ! hasEdits) {
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
hasEntries = db.next(readerItem, NullWritable.get());
}
}
}
/**
* This class helps the LinksByURLProcessor test a list of
* Page objects, sorted by URL, for outlink-counts. We query
* it with a series of Link target URLs, which must also
* arrive in sorted URL order.
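*
* e.g. (the reader and URL are illustrative):
* <pre>
*   TargetTester tester = new TargetTester(pagesByURLReader);
*   int code = tester.hasOutlinks(new UTF8("http://example.com/"));
*   // code is NO_OUTLINKS, HAS_OUTLINKS, or LINK_INVALID
* </pre>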
*/
private class TargetTester {
MapFile.Reader pagedb;
boolean hasPage = false;
UTF8 pageURL = null;
Page page = null;
/**
* Prime the tester with the first Page entry.
*/
public TargetTester(MapFile.Reader pagedb) throws IOException {
this.pagedb = pagedb;
this.pageURL = new UTF8();
this.page = new Page();
this.hasPage = pagedb.next(pageURL, page);
}
/**
* Match the given URL against the sorted series of Page URLs.
*/
public int hasOutlinks(UTF8 curURL) throws IOException {
int returnCode = NO_OUTLINKS;
int comparison = pageURL.compareTo(curURL);
while (hasPage && comparison < 0) {
hasPage = pagedb.next(pageURL, page);
if (hasPage) {
comparison = pageURL.compareTo(curURL);
}
}
if (hasPage) {
if (comparison == 0) {
returnCode = (page.getNumOutlinks() > 0) ? HAS_OUTLINKS : NO_OUTLINKS;
} else if (comparison > 0) {
//
// This situation indicates that the Link's
// target page has been deleted, probably
// because we repeatedly failed to fetch the URL.
// So, we should delete the Link.
//
returnCode = LINK_INVALID;
}
}
return returnCode;
}
/**
* Release the underlying page db reader.
*/
public void close() throws IOException {
pagedb.close();
}
}
/**
* Closes down and merges changes to the URL-driven link
* table. While merging, it consults the new pagesByURL table
* to fix each Link's target-has-outlink flag, and emits edits
* (via futureEdits) for a second pass over the linksByMD5 table.
*/
private class LinksByURLProcessor extends CloseProcessor {
MapFile.Reader pageDb;
SequenceFile.Writer futureEdits;
/**
* "pageDb" is the freshly-written pagesByURL table; "futureEdits"
* receives instructions for the stage-two linksByMD5 rebuild.
*/
public LinksByURLProcessor(MapFile.Reader db, SequenceFile.Writer editWriter, MapFile.Reader pageDb, SequenceFile.Writer futureEdits) {
super(LINKS_BY_URL, db, editWriter, new SequenceFile.Sorter(new LinkInstruction.UrlComparator(), NullWritable.class), new Link.UrlComparator(), Link.class, NullWritable.class);
this.pageDb = pageDb;
this.futureEdits = futureEdits;
}
/**
* Close down as usual, then release the page db reader.
*/
public long closeDown(File workingDir, File outputDir, long numEdits) throws IOException {
long result = super.closeDown(workingDir, outputDir, numEdits);
pageDb.close();
return result;
}
/**
* Merge the existing db with the edit-stream into a brand-new file.
*/
void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException {
WritableComparator comparator = new Link.UrlComparator();
// Create the keys and vals we'll use
LinkInstruction editItem = new LinkInstruction();
Link readerItem = new Link();
// Read the first items from both streams
boolean hasEntries = db.next(readerItem, NullWritable.get());
boolean hasEdits = sortedEdits.next(editItem, NullWritable.get());
TargetTester targetTester = new TargetTester(pageDb);
// As long as we have both edits and entries to process,
// we need to interleave them
while (hasEntries && hasEdits) {
int curInstruction = editItem.getInstruction();
if (curInstruction == ADD_LINK) {
// When we add a link, we may replace a previous
// link with identical URL and MD5 values. Our
// comparator will test both
//
int comparison = comparator.compare(readerItem, editItem.getLink());
if (comparison < 0) {
// Write the readerKey, just passing it along.
// Don't process the edit yet.
int linkTest = targetTester.hasOutlinks(readerItem.getURL());
if (linkTest == LINK_INVALID) {
liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_SINGLE_LINK, NullWritable.get());
targetOutlinkEdits++;
} else {
boolean oldOutlinkStatus = readerItem.targetHasOutlink();
boolean newOutlinkStatus = (linkTest == HAS_OUTLINKS);
// Do the conditional so we minimize unnecessary
// mod-writes.
if (oldOutlinkStatus != newOutlinkStatus) {
readerItem.setTargetHasOutlink(newOutlinkStatus);
liwriter.appendInstructionInfo(futureEdits, readerItem, ADD_LINK, NullWritable.get());
targetOutlinkEdits++;
}
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
}
hasEntries = db.next(readerItem, NullWritable.get());
} else if (comparison == 0) {
// Write the new item, "replacing" the old one.
// We move to the next edit instruction and move
// past the replaced db entry.
Link editLink = editItem.getLink();
int linkTest = targetTester.hasOutlinks(editLink.getURL());
// Delete the edit/readerItem from the other table if it's
// found to be invalid.
if (linkTest == LINK_INVALID) {
liwriter.appendInstructionInfo(futureEdits, editLink, DEL_SINGLE_LINK, NullWritable.get());
} else {
editLink.setTargetHasOutlink(linkTest == HAS_OUTLINKS);
liwriter.appendInstructionInfo(futureEdits, editLink, ADD_LINK, NullWritable.get());
newDb.append(editLink, NullWritable.get());
itemsWritten++;
}
targetOutlinkEdits++;
hasEntries = db.next(readerItem, NullWritable.get());
hasEdits = sortedEdits.next(editItem, NullWritable.get());
} else if (comparison > 0) {
// Write the new item. We stay at the current
// db entry.
Link editLink = editItem.getLink();
int linkTest = targetTester.hasOutlinks(editLink.getURL());
// Delete the edit from the other table if it's invalid
if (linkTest == LINK_INVALID) {
liwriter.appendInstructionInfo(futureEdits, editLink, DEL_SINGLE_LINK, NullWritable.get());
} else {
editLink.setTargetHasOutlink(linkTest == HAS_OUTLINKS);
liwriter.appendInstructionInfo(futureEdits, editLink, ADD_LINK, NullWritable.get());
newDb.append(editLink, NullWritable.get());
itemsWritten++;
}
targetOutlinkEdits++;
hasEdits = sortedEdits.next(editItem, NullWritable.get());
}
} else if (curInstruction == DEL_LINK) {
// When we delete a link, we do it by MD5 and apply
// it to the index first. A single delete instruction
// may remove many items in the db, during the earlier
// processing. However, unlike the index-processing stage,
// here we can expect a new DEL instruction for every
// item that we remove from the db.
//
int comparison = comparator.compare(readerItem, editItem.getLink());
if (comparison < 0) {
// Write readerKey, just passing it along. Don't
// process the edit yet.
int linkTest = targetTester.hasOutlinks(readerItem.getURL());
// Delete the reader item if it's found to be invalid
if (linkTest == LINK_INVALID) {
liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_SINGLE_LINK, NullWritable.get());
} else {
readerItem.setTargetHasOutlink(linkTest == HAS_OUTLINKS);
liwriter.appendInstructionInfo(futureEdits, readerItem, ADD_LINK, NullWritable.get());
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
}
targetOutlinkEdits++;
hasEntries = db.next(readerItem, NullWritable.get());
} else if (comparison == 0) {
// "Delete" the item by passing by the readerKey.
// We want a new entry, as well as the next instruction
// to process.
hasEntries = db.next(readerItem, NullWritable.get());
hasEdits = sortedEdits.next(editItem, NullWritable.get());
} else if (comparison > 0) {
// Ignore, move on to next instruction
hasEdits = sortedEdits.next(editItem, NullWritable.get());
}
}
}
// Now we have only edits. No more preexisting items!
while (! hasEntries && hasEdits) {
int curInstruction = editItem.getInstruction();
if (curInstruction == ADD_LINK) {
//
// Add the item from the edit list, making sure the
// outlinks flag is set properly.
//
Link editLink = editItem.getLink();
int linkTest = targetTester.hasOutlinks(editLink.getURL());
if (linkTest == LINK_INVALID) {
liwriter.appendInstructionInfo(futureEdits, editLink, DEL_SINGLE_LINK, NullWritable.get());
} else {
editLink.setTargetHasOutlink(linkTest == HAS_OUTLINKS);
liwriter.appendInstructionInfo(futureEdits, editLink, ADD_LINK, NullWritable.get());
newDb.append(editLink, NullWritable.get());
itemsWritten++;
}
targetOutlinkEdits++;
} else if (curInstruction == DEL_LINK) {
// Ignore operation
}
// Move on to next edit
hasEdits = sortedEdits.next(editItem, NullWritable.get());
}
// Now we have only preexisting items. Just copy them
// to the new file, in order.
while (hasEntries && ! hasEdits) {
//
// Simply copy the remaining database items, first making
// sure the 'outlinks' flag is set properly.
//
int linkTest = targetTester.hasOutlinks(readerItem.getURL());
if (linkTest == LINK_INVALID) {
liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_SINGLE_LINK, NullWritable.get());
targetOutlinkEdits++;
} else {
boolean oldOutlinkStatus = readerItem.targetHasOutlink();
boolean newOutlinkStatus = (linkTest == HAS_OUTLINKS);
if (oldOutlinkStatus != newOutlinkStatus) {
readerItem.setTargetHasOutlink(newOutlinkStatus);
liwriter.appendInstructionInfo(futureEdits, readerItem, ADD_LINK, NullWritable.get());
targetOutlinkEdits++;
}
// Now copy the object
newDb.append(readerItem, NullWritable.get());
itemsWritten++;
}
// Move on to next
hasEntries = db.next(readerItem, NullWritable.get());
}
targetTester.close();
}
}
/**
* Create the WebDB for the first time.
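* e.g. <code>WebDBWriter.createWebDB(new File("crawldb"))</code>,
* where the path is illustrative.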
*/
public static void createWebDB(File dir) throws IOException {
WebDBWriter starter = new WebDBWriter(dir, true);
starter.close();
}
boolean haveEdits = false;
File dbFile, oldDbFile, newDbFile, tmp;
MapFile.Reader pagesByURL, pagesByMD5, linksByURL, linksByMD5;
SequenceFile.Writer pagesByURLWriter, pagesByMD5Writer, linksByURLWriter, linksByMD5Writer;
long pagesByURLEdits = 0, pagesByMD5Edits = 0, linksByURLEdits = 0, linksByMD5Edits = 0, targetOutlinkEdits = 0;
PageInstructionWriter piwriter = new PageInstructionWriter();
LinkInstructionWriter liwriter = new LinkInstructionWriter();
DataInputBuffer inBuf = new DataInputBuffer();
DataOutputBuffer outBuf = new DataOutputBuffer();
FileOutputStream dbReadLockData, dbWriteLockData;
FileLock dbWriteLock;
/**
* Create a WebDBWriter.
*/
public WebDBWriter(File dir) throws IOException {
this(dir, false);
}
/**
* Private constructor, so we can either open or create the db files.
*/
private WebDBWriter(File dir, boolean create) throws IOException {
this.dbFile = new File(dir, "webdb");
this.oldDbFile = new File(dir, "webdb.old");
this.newDbFile = new File(dir, "webdb.new");
this.tmp = new File(newDbFile, "tmp");
if ((! dir.exists()) && create) {
dir.mkdirs();
}
// Lock the writeLock immediately.
File writeLockFile = new File(dir, "dbwritelock");
writeLockFile.createNewFile();
this.dbWriteLockData = new FileOutputStream(writeLockFile);
this.dbWriteLock = dbWriteLockData.getChannel().lock(0L, Long.MAX_VALUE, false);
// build a FileChannel object for readLocking later on
File readLockFile = new File(dir, "dbreadlock");
readLockFile.createNewFile();
this.dbReadLockData = new FileOutputStream(readLockFile);
// Resolve any partial-state dirs from the last run.
if (oldDbFile.exists()) {
if (dbFile.exists()) {
throw new IOException("Impossible condition: directories " + oldDbFile + " and " + dbFile + " cannot exist simultaneously");
}
if (newDbFile.exists()) {
newDbFile.renameTo(dbFile);
}
FileUtil.fullyDelete(oldDbFile);
} else if (newDbFile.exists()) {
FileUtil.fullyDelete(newDbFile);
}
// Create the directory, if necessary
if ((! dbFile.exists()) && create) {
dbFile.mkdirs();
}
// Delete any partial edits from last time.
if (tmp.exists()) {
FileUtil.fullyDelete(tmp);
}
tmp.mkdirs();
// Create brand-new (empty) db files, if requested
if (create) {
new MapFile.Writer(new File(dbFile, PAGES_BY_URL).getPath(), new UTF8.Comparator(), Page.class).close();
new MapFile.Writer(new File(dbFile, PAGES_BY_MD5).getPath(), new Page.Comparator(), NullWritable.class).close();
new MapFile.Writer(new File(dbFile, LINKS_BY_URL).getPath(), new Link.UrlComparator(), NullWritable.class).close();
new MapFile.Writer(new File(dbFile, LINKS_BY_MD5).getPath(), new Link.MD5Comparator(), NullWritable.class).close();
}
// Create the Readers for those files
this.pagesByURL = new MapFile.Reader(new File(dbFile, PAGES_BY_URL).getPath(), new UTF8.Comparator());
this.pagesByMD5 = new MapFile.Reader(new File(dbFile, PAGES_BY_MD5).getPath(), new Page.Comparator());
this.linksByURL = new MapFile.Reader(new File(dbFile, LINKS_BY_URL).getPath(), new Link.UrlComparator());
this.linksByMD5 = new MapFile.Reader(new File(dbFile, LINKS_BY_MD5).getPath(), new Link.MD5Comparator());
// Create writers for new edit-files. We write changes
// into these files, then apply them to the db upon close().
pagesByURLWriter = new SequenceFile.Writer(new File(tmp, PAGES_BY_URL + ".out").getPath(), PageInstruction.class, NullWritable.class);
pagesByMD5Writer = new SequenceFile.Writer(new File(tmp, PAGES_BY_MD5 + ".out").getPath(), PageInstruction.class, NullWritable.class);
linksByURLWriter = new SequenceFile.Writer(new File(tmp, LINKS_BY_URL + ".out").getPath(), LinkInstruction.class, NullWritable.class);
linksByMD5Writer = new SequenceFile.Writer(new File(tmp, LINKS_BY_MD5 + ".out").getPath(), LinkInstruction.class, NullWritable.class);
}
/**
* Shutdown. Applies any buffered edits to build a brand-new
* db, then swaps it into place while holding the locks.
*/
public synchronized void close() throws IOException {
if (haveEdits) {
newDbFile.mkdirs();
// Process the 4 tables:
// 1. pagesByURL
// 2. pagesByMD5
// 3. linksByMD5
// 4. linksByURL
// 1. Process pagesByURL. Processing this stream will
// generate a number of edits for the pagesByMD5 step.
//
CloseProcessor pagesByURLProcessor = new PagesByURLProcessor(pagesByURL, pagesByURLWriter, pagesByMD5Writer);
long numPBUItems = pagesByURLProcessor.closeDown(tmp, newDbFile, pagesByURLEdits);
//
// 2. Process the pagesByMD5 edit stream. This will
// make calls to deleteLink(), which are processed later.
//
CloseProcessor pagesByMD5Processor = new PagesByMD5Processor(pagesByMD5, pagesByMD5Writer);
long numPBMItems = pagesByMD5Processor.closeDown(tmp, newDbFile, pagesByMD5Edits);
//
// 3. Process the linksByMD5 edit stream first. This
// will generate a number of edits for the linksByURL
// stream. This also processes the calls to deleteLink()
// that may have been invoked as part of the above call
// to process pagesByMD5.
CloseProcessor linksByMD5Processor = new LinksByMD5Processor(linksByMD5, linksByMD5Writer, linksByURLWriter);
long numLBMItems = linksByMD5Processor.closeDown(tmp, newDbFile, linksByMD5Edits);
//
// 4. Process the linksByURL edit stream. This will also
// read through the sorted PagesByURL file, and modify
// the Links so that they indicate whether the target
// Page has any outlinks or not.
//
SequenceFile.Writer targetOutlinkEditsWriter = new SequenceFile.Writer(new File(tmp, LINKS_BY_MD5 + ".out").getPath(), LinkInstruction.class, NullWritable.class);
CloseProcessor linksByURLProcessor = new LinksByURLProcessor(linksByURL, linksByURLWriter, new MapFile.Reader(new File(newDbFile, PAGES_BY_URL).getPath(), new UTF8.Comparator()), targetOutlinkEditsWriter);
long numLBUItems = linksByURLProcessor.closeDown(tmp, newDbFile, linksByURLEdits);
//
// If the number of linksByURL processed is zero, then
// there's no reason to do all of the following with
// a 2nd pass through linksByMD5.
//
if (numLBUItems == 0) {
targetOutlinkEditsWriter.close();
//
// Need to load in the previous number of links, so we
// don't overwrite it with the wrong value.
//
File stats = new File(dbFile, STATS_FILE);
if (stats.exists()) {
DataInputStream in = new DataInputStream(new FileInputStream(stats));
try {
in.read(); // version
in.readLong(); // previous num of pages
numLBMItems = in.readLong(); // previous num of links
} finally {
in.close();
}
}
} else {
//
// 5. Step 4 did several things to the LinksByURL db.
// First, it implemented all the changes generated
// by instructions from LinksByMD5Processor. Second,
// it made lots of calls to setTargetHasOutlink. This
// changes the content of the Link objects.
//
// So now we need to reconstruct the LinksByMD5
// list, using the Links we created in step #4.
//
File stageTwoDbFile = new File(newDbFile, "stage2.subdir");
stageTwoDbFile.mkdir();
MapFile.Reader linksByMD5ForStageTwo = new MapFile.Reader(new File(newDbFile, LINKS_BY_MD5).getPath(), new Link.MD5Comparator());
CloseProcessor linksByMD5StageTwoProcessor = new LinksByMD5Processor(linksByMD5ForStageTwo, targetOutlinkEditsWriter, null);
numLBMItems = linksByMD5StageTwoProcessor.closeDown(tmp, stageTwoDbFile, targetOutlinkEdits);
//
// 6. Now move the Stage2 LinksByMD5 file up to replace
// the one at the primary level
//
linksByMD5ForStageTwo.close();
File stageOneLinksByMD5 = new File(newDbFile, LINKS_BY_MD5);
FileUtil.fullyDelete(stageOneLinksByMD5);
new File(stageTwoDbFile, LINKS_BY_MD5).renameTo(stageOneLinksByMD5);
FileUtil.fullyDelete(stageTwoDbFile);
}
//
// 7. Finally, write out the total num of pages and links
//
File stats = new File(newDbFile, STATS_FILE);
DataOutputStream out = new DataOutputStream(new FileOutputStream(stats));
try {
//
// These counts are guaranteed to be correct; they're
// based on the counts made during processing of primary-key
// edits. Pages are always counted by URL first, and only
// subsequently by MD5 if there are any edits to make. Links
// are always counted by MD5 first, and only by URL subsequently
// and conditionally.
//
// If there are a bunch of edits that result in no modifications
// to the db, the two sets of counts (one for URL, one for
// MD5) could become out of sync. So we use the ones that
// are sure to be accurate.
//
out.write(CUR_VERSION);
out.writeLong(numPBUItems);
out.writeLong(numLBMItems);
} finally {
out.close();
}
} else {
pagesByURLWriter.close();
pagesByMD5Writer.close();
linksByMD5Writer.close();
linksByURLWriter.close();
}
// Close down the db-readers
pagesByURL.close();
pagesByMD5.close();
linksByMD5.close();
linksByURL.close();
// Delete the edits directory.
FileUtil.fullyDelete(tmp);
// Before we can move the newdb into place over the
// old one, we need to make sure there are no processes
// reading the old db. This obtains an exclusive lock
// on the read-lock file for the db.
FileLock dbReadLock = dbReadLockData.getChannel().lock(0L, Long.MAX_VALUE, false);
// We're done! Now we rename the directories and
// all is well.
if (haveEdits) {
dbFile.renameTo(oldDbFile);
newDbFile.renameTo(dbFile);
FileUtil.fullyDelete(oldDbFile);
} else {
// Sometimes the "newdb" is created as a side-effect
// of creating the tmp dir, even when there are no edits.
// Get rid of it.
FileUtil.fullyDelete(newDbFile);
}
// release the readlock
dbReadLock.release();
dbReadLockData.close();
// release the writelock
dbWriteLock.release();
dbWriteLockData.close();
}
/////////////////////
// Methods for adding, and managing, db operations
////////////////////
/**
* Add a page to the page database
*/
public synchronized void addPage(Page page) throws IOException {
// The 2nd (byMD5) part is handled during processing of the 1st.
haveEdits = true;
pagesByURLEdits++;
piwriter.appendInstructionInfo(pagesByURLWriter, page, ADD_PAGE, NullWritable.get());
}
/**
* Add a page to the page database, with a brand-new score
*/
public synchronized void addPageWithScore(Page page) throws IOException {
// The 2nd (byMD5) part is handled during processing of the 1st.
haveEdits = true;
pagesByURLEdits++;
piwriter.appendInstructionInfo(pagesByURLWriter, page, ADD_PAGE_WITH_SCORE, NullWritable.get());
}
/**
* Add a page to the page database, but don't replace an
* existing one with the same URL.
*/
public synchronized void addPageIfNotPresent(Page page) throws IOException {
// The 2nd (index) part is handled during processing of the 1st.
haveEdits = true;
pagesByURLEdits++;
piwriter.appendInstructionInfo(pagesByURLWriter, page, ADD_PAGE_IFN_PRESENT, NullWritable.get());
}
/**
* Add a page to the page database, but don't replace an
* existing one with the same URL.
*
* If we do insert the new Page, then we should also insert
* the given Link object.
*/
public synchronized void addPageIfNotPresent(Page page, Link link) throws IOException {
// The 2nd (index) part is handled during processing of the 1st.
haveEdits = true;
pagesByURLEdits++;
piwriter.appendInstructionInfo(pagesByURLWriter, page, link, ADD_PAGE_IFN_PRESENT, NullWritable.get());
}
/**
* Remove a page from the page database.
*/
public synchronized void deletePage(String url) throws IOException {
// The 2nd (index) part is handled during processing of the 1st.
haveEdits = true;
Page p = new Page();
p.setURL(url);
pagesByURLEdits++;
piwriter.appendInstructionInfo(pagesByURLWriter, p, DEL_PAGE, NullWritable.get());
}
/**
* Add a link to the link database
*/
public synchronized void addLink(Link lr) throws IOException {
haveEdits = true;
linksByMD5Edits++;
liwriter.appendInstructionInfo(linksByMD5Writer, lr, ADD_LINK, NullWritable.get());
}
/**
* Remove links with the given MD5 from the db.
*/
public synchronized void deleteLink(MD5Hash md5) throws IOException {
haveEdits = true;
linksByMD5Edits++;
liwriter.appendInstructionInfo(linksByMD5Writer, new Link(md5, 0, "", ""), DEL_LINK, NullWritable.get());
}
/**
* The WebDBWriter.main() provides some handy methods for
* testing the WebDB.
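* For example (db path and values illustrative):
* <pre>
*   java net.nutch.db.WebDBWriter crawldb -create
*   java net.nutch.db.WebDBWriter crawldb -addpage md5hex http://example.com/
* </pre>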
*/
public static void main(String argv[]) throws FileNotFoundException, IOException {
if (argv.length < 2) {
System.out.println("Usage: java net.nutch.db.WebDBWriter <db> [-create] | [-addpage id url] | [-addpageifnp id url] | [-deletepage url] | [-addlink fromID url] | [-deletelink fromID]");
return;
}
if ("-create".equals(argv[1])) {
WebDBWriter.createWebDB(new File(argv[0]));
System.out.println("Created webdb at " + argv[0]);
} else if ("-addpage".equals(argv[1])) {
MD5Hash md5 = new MD5Hash(argv[2]);
String url = argv[3];
WebDBWriter writer = new WebDBWriter(new File(argv[0]));
try {
Page page = new Page(url, md5);
writer.addPageWithScore(page);
System.out.println("Added page (with score): " + page);
} finally {
writer.close();
}
} else if ("-addpageifnp".equals(argv[1])) {
MD5Hash md5 = new MD5Hash(argv[2]);
String url = argv[3];
WebDBWriter writer = new WebDBWriter(new File(argv[0]));
try {
Page page = new Page(url, md5);
writer.addPageIfNotPresent(page);
System.out.println("Added page: " + page);
} finally {
writer.close();
}
} else if ("-deletepage".equals(argv[1])) {
String url = argv[2];
WebDBWriter writer = new WebDBWriter(new File(argv[0]));
try {
writer.deletePage(url.trim());
System.out.println("Deleted item(s)");
} finally {
writer.close();
}
} else if ("-addlink".equals(argv[1])) {
MD5Hash fromID = new MD5Hash(argv[2]);
String url = argv[3];
WebDBWriter writer = new WebDBWriter(new File(argv[0]));
try {
Link link = new Link(fromID, MD5Hash.digest("randomstring.com").halfDigest(), url, "SomeRandomAnchorText_" + System.currentTimeMillis());
writer.addLink(link);
System.out.println("Added link: " + link);
} finally {
writer.close();
}
} else if ("-deletelink".equals(argv[1])) {
MD5Hash fromID = new MD5Hash(argv[2]);
WebDBWriter writer = new WebDBWriter(new File(argv[0]));
try {
writer.deleteLink(fromID);
System.out.println("Deleted item(s)");
} finally {
writer.close();
}
} else {
System.out.println("Sorry, no command with name " + argv[1]);
}
}
}